clear all

// Load the base panel and attach college79.dta by respondent id.
// m:1 requires CASEID_1979 to be unique in college79.dta; every base row must match.
use ../import/basic_variables.dta, clear
merge m:1 CASEID_1979 using ../processed/college79.dta
assert _merge == 3
drop _merge

// Merge with the previous year's CPI (for deflating family income).
// cpi.dta is keyed on `year', so the key is shifted back one year for this merge
// and restored afterwards; year_income keeps the shifted (income) year.
gen year_income = year - 1
replace year = year_income
merge m:1 year using ../import/cpi.dta, keep(master match)
assert _merge == 3
drop _merge
rename cpi cpi_income
replace year = year_income + 1

// Merge with the current year's CPI (variable stays named `cpi').
merge m:1 year using ../import/cpi.dta, keep(master match)
assert _merge == 3
drop _merge

tab year, missing

// sample selection
keep if SAMPLE_ID_1979 <= 14 // civilian sample

// Sampling weight for the civilian sample replicates the population:
// frequency-weighted tabulation of Q1_3_A_Y_1979 by SAMPLE_RACE_78SCRN in 1979,
// one table per value (1, 2) of SAMPLE_SEX_1979.
forvalues s = 1/2 {
	tab Q1_3_A_Y_1979 SAMPLE_RACE_78SCRN if SAMPLE_SEX_1979==`s' & year==1979 [fw=SAMPWEIGHT_1979]
}

// valid AFQT (AFQT-3)
// AFQT_na flags scores coded -4 or below; AFQT3 rescales the remaining scores by 10^5.
gen AFQT_na = AFQT_3_1981 <= -4
gen AFQT3 = AFQT_3_1981/10^5 if !AFQT_na
summarize AFQT3 [aw=SAMPWEIGHT_1979], detail
tab AFQT_na SAMPLE_SEX_1979 if year==1979, missing

// valid location record
tab loc_um SAMPLE_SEX_1979 if year==1979 & AFQT_na==0, missing

// hgc cumulative
// intN numbers each respondent's rows 1, 2, ... in year order; it is the panel time index.
bysort CASEID_1979 (year): gen intN = _n
tsset CASEID_1979 intN

// Running value of hgc within respondent: take hgc in 1979, afterwards move up only
// when the current hgc exceeds the value carried from the previous row.
// The previous row is referenced with [_n-1] under -by- rather than with L.:
// a subscripted reference inside -replace- reads the value just written to the
// preceding row, which is what carries the running value forward row by row.
// On each respondent's first row hgc_cum[_n-1] is missing, so a first row that is
// not 1979 stays missing (hgc > . is false).
gen hgc_cum = .
bysort CASEID_1979 (intN): replace hgc_cum = cond(hgc>hgc_cum[_n-1]|year==1979,hgc,hgc_cum[_n-1])
tab hgc_cum year, m

// hgc_r recodes cumulative values 19 and 20 to 18; all other values are unchanged.
gen hgc_r = cond(hgc_cum==19|hgc_cum==20,18,hgc_cum)

// Year of birth (Q1_3_A_Y_1979 holds the two-digit year) and age in the survey year.
gen yob = Q1_3_A_Y_1979 + 1900
tab yob, missing
gen age = year - yob

// hgc8 = 1 for respondents with at least one row at age<=22 where hgc>=8
// (at least 8th grade until age 22; oldest cohort is 22 yo in 1979).
egen hgc8 = max(age<=22 & hgc>=8), by(CASEID_1979)
tab hgc8 SAMPLE_SEX_1979 if year==1979 & AFQT_na==0 & loc_um==0, missing

// person-year observation selection
gen Dint = intM!=-5 // interviewed
gen Dage = inrange(age,25,54) // age 25-54
gen Denrl = (inlist(enrl,1,4)|year>=2008) // not enrolled (no report in year 2008 or later)
// System-missing values compare as larger than any number, so hrp>=0 alone would
// flag a row with missing hrp as having wage info; exclude it explicitly.
gen Dhrp = (hrp>=0 & !missing(hrp)) // nonmissing wage info

// `cond' accumulates the row-level restrictions one criterion at a time;
// `cond_any' accumulates the matching person-level restrictions
// (respondent has at least one row passing all criteria applied so far).
// Both locals are used again after the loop.
local cond `"loc_um==0 & AFQT_na==0 & hgc8==1"'
local cond_any `"loc_um==0 & AFQT_na==0 & hgc8==1"' 
foreach v in int age enrl hrp{
	// rows passing / failing criterion `v' among rows that passed the earlier ones, by sex
	tab year D`v' if `cond' & SAMPLE_SEX_1979==1, m
	tab year D`v' if `cond' & SAMPLE_SEX_1979==2, m
	local cond `"`cond' & D`v'==1"'
	// D`v'_any: respondent has at least one row satisfying everything up to and including `v'
	gen temp = (`cond') 
	egen D`v'_any = max(temp), by(CASEID_1979)
	tab D`v'_any SAMPLE_SEX_1979 if `cond_any' & year==1979, m
	local cond_any `"`cond_any' & D`v'_any==1"'
	drop temp
}

// smpl marks the final person-year sample.
gen smpl = (`cond')
tab age smpl, m
tab year smpl, m

// Nobs = number of sample rows per respondent; wgt spreads the 1979 sampling weight
// evenly over them. total() is the documented name of the egen function formerly called sum().
egen Nobs = total(smpl), by(CASEID_1979)
gen wgt = SAMPWEIGHT_1979/Nobs

// Per-respondent maximum of hgc (over all rows) and mean age over sample rows only.
egen hgc_max = max(hgc), by(CASEID_1979)
egen avg_age = mean(age) if smpl==1, by(CASEID_1979)

// Weighted mean of avg_age by hgc_max, one table per value (1, 2) of SAMPLE_SEX_1979.
forvalues s = 1/2 {
	tabstat avg_age if smpl==1 & SAMPLE_SEX_1979==`s' [aw=wgt], stat(mean) by(hgc_max)
}

// Cross-check reported hgc against the cumulative measure within the sample.
tab hgc hgc_cum if smpl==1, missing

// Indicator variables for the values of hdr:
//   0 -> nn, 1 -> hg, 2 -> aa, 3 or 4 -> ba, 5 -> ma,
//   6 -> pd (PhD), 7 -> pf (Professional), 8 -> ot
local code = 0
foreach v in nn hg aa {
	gen deg_`v' = (hdr==`code')
	local ++code
}
gen deg_ba = inlist(hdr,3,4)
local code = 5
foreach v in ma pd pf ot {
	gen deg_`v' = (hdr==`code')
	local ++code
}

// deg_*_max = 1 if the respondent ever has that hdr value; tabulate hgc_max for those
// respondents (restricted by the person-level condition `cond_any' defined earlier in the file).
foreach v in nn hg aa ba ma pd pf ot {
	egen deg_`v'_max = max(deg_`v'), by(CASEID_1979)
	tab hgc_max if `cond_any' & year==1979 & deg_`v'_max==1, missing
}

// Hourly rate of pay in dollars, inflation adjusted using CPI-U (1982-84=100).
// Defined only for rows flagged Dhrp==1.
gen hrp_r = 0.01*hrp/(cpi/100) if Dhrp==1
tabstat hrp_r, by(year) stat(mean sd min p1 p10 q p90 p99 max N) col(stat)

// Log wage after clamping the real hourly rate to the interval [1, 100].
gen logwage = ln( max(min(hrp_r,100),1) ) if Dhrp==1

// Weighted summary of the main sample variables, one table per value (1, 2) of SAMPLE_SEX_1979.
forvalues s = 1/2 {
	tabstat logwage hgc_cum age if smpl==1 & SAMPLE_SEX_1979==`s' [aw=wgt], stat(mean sd min q max) col(stat)
}

// Indicators for SAMPLE_RACE_78SCRN codes 1 (hsp) and 2 (blk).
gen hsp = SAMPLE_RACE_78SCRN==1
gen blk = SAMPLE_RACE_78SCRN==2

// Parents' highest grade completed, grouped into six labelled categories
// (hgc_m from HGC_MOTHER_1979, hgc_f from HGC_FATHER_1979); codes -4..-1 become "missing".
foreach p in MOTHER FATHER {
	local s = lower(substr("`p'",1,1))
	recode HGC_`p'_1979 (-4/-1=0 "missing") (0/8=1 "8th grade or less") (9/11=2 "some high school") ///
	(12=3 "high school graduate") (13/15=4 "some college") (16/20=5 "college graduate"), gen(hgc_`s')
}
tab HGC_MOTHER_1979 hgc_m if year==1979, m 
tab HGC_FATHER_1979 hgc_f if year==1979, m

// Number of siblings: codes -3..0 collapsed to 0, 6..29 top-coded at 6.
recode FAM_28A_1979 (-3/0 = 0 "unsure or zero")(6/29=6 "more than 5"), gen(nsibs)
tab nsibs SAMPLE_SEX_1979 if `cond_any' & year==1979, m

// County belongs to urbanized area.
// A missing uafrac compares as larger than 0.5, so without the guard it would be
// coded urban=1; leave urban missing instead so that any later !missing() check can catch it.
gen urban = uafrac>=0.5 if !missing(uafrac)

// family income
// avl_income flags rows whose income year falls at age 17 or younger and whose
// netfi is a valid (non-negative) amount. System-missing netfi compares as >= 0,
// so it is excluded explicitly; otherwise such rows would add nothing to the
// numerator below but would still be counted in the denominator.
gen avl_income = (year_income-yob<=17 & netfi>=0 & !missing(netfi))
tab year avl_income, m
sum netfi if avl_income==1, d

// finc: per-respondent average of netfi over the flagged rows, deflated with the
// income-year CPI and expressed in thousands. Missing when no row is flagged (0/0).
// total() is the documented name of the egen function formerly called sum().
gen t1 = netfi*avl_income*(100/cpi_income)
egen t2 = total(t1), by(CASEID_1979)
egen t3 = total(avl_income), by(CASEID_1979)
tab yob t3 if `cond_any' & year==1979, m
gen finc = (t2/t3)/1000
sum finc if `cond_any' & year==1979, d
drop t1 t2 t3

// finc_pct: per-respondent average of netfi_pct over the same flagged rows.
// NOTE(review): a flagged row with missing netfi_pct would still be counted in the
// denominator - confirm netfi_pct is always present when netfi is valid.
egen t1 = total(netfi_pct*avl_income), by(CASEID_1979)
egen t2 = total(avl_income), by(CASEID_1979)
gen finc_pct = (t1/t2)
sum finc_pct if `cond_any' & year==1979, d
drop t1 t2


// One indicator per parental-education category (0-5) for mother (m) and father (f),
// appended to the list of variables summarized below.
local vlist hsp blk pub2 pub4 urban
forvalues j = 0/5 {
	foreach p in m f {
		gen hgc_`p'`j' = hgc_`p'==`j'
		local vlist `vlist' hgc_`p'`j'
	}
}

// Weighted means of the indicators by sex, one row per respondent (1979 rows only).
tabstat `vlist' if `cond_any' & year==1979 [aw=SAMPWEIGHT_1979], by(SAMPLE_SEX_1979) stat(mean) col(var) format(%7.4f)

// Polynomial terms in age, centred at 25.
gen ageL = age-25
gen ageQ = (age-25)^2
gen ageC = (age-25)^3

// Tuition in thousands, 1977 current dollar to CPI-U dollar (factor 100/60.6).
foreach k in 2 4 {
	gen tui`k' = tui`k'_c/10^3 *(100/60.6)
}

// Local labor-market measures from the yearly urate* and earn_pe* variables, 1974-1981:
//   locU17 / locE17 : value (log value for earnings) in the year the respondent turns 17
//   locU   / locE   : average over the eight years 1974-1981
gen locU17 = .
gen locU = 0
gen locE17 = .
gen locE = 0
forvalues y = 1974/1981 {
	replace locU17 = urate`y' if yob+17==`y'
	replace locU = locU + urate`y'/8
	replace locE17 = log(earn_pe`y') if yob+17==`y'
	replace locE = locE + log(earn_pe`y')/8
}

// Weighted person-level summary (1979 rows), one table per value (1, 2) of SAMPLE_SEX_1979.
forvalues s = 1/2 {
	tabstat AFQT3 nsibs tui2 tui4 dist2 dist4 locU locE locE17 locU17 if `cond_any' & SAMPLE_SEX_1979==`s' & year==1979 [aw=SAMPWEIGHT_1979], stat(mean sd min q max) col(stat)
}

gen AFQT3_q = AFQT3^2

// Short names for the 1979 sampling weight and sex; flag sample ids 1-8 as cs_smpl.
rename SAMPWEIGHT_1979 wgt_cc
rename SAMPLE_SEX_1979 sex
gen cs_smpl = (SAMPLE_ID_1979<=8)
tab SAMPLE_ID_1979 cs_smpl, missing

// Keep only the selected person-year rows.
keep if smpl==1

// Variables that must be complete in the output file.
local vlist CASEID_1979 year wgt sex cs_smpl wgt_cc logwage hgc_r hgc_cum AFQT3 blk hsp hgc_m hgc_f nsibs yob age ///
divisiona urban locU locE pub2 dist2 tui2 pub4 dist4 tui4 locU17 locE17 cty_grp
summarize `vlist'
foreach v of local vlist {
	assert !missing(`v')
}

// finc and finc_pct are kept as well but are allowed to be missing.
keep `vlist' finc finc_pct
order `vlist'
sort CASEID_1979 year
compress
save ../processed/NLSY79.dta, replace
